college_stats <- read_csv("raw data/cfbd_stats_season_player.csv")
teams <- read_csv("raw data/cfbd_team_info.csv")
draft_info <- read_csv("raw data/cfbd_draft_picks.csv")
draft_combine <- read_csv("raw data/load_combine.csv")
draft_nfl <- read_csv("raw data/load_draft_picks.csv")
rosters <- read_csv("raw data/load_rosters_weekly.csv")
num_schools <- college_stats %>%
group_by(athlete_id) %>%
summarize(num_schools = n_distinct(team))
gg_miss_var(college_stats)
missing_all <- college_stats %>%
select(-c("year", "team", "conference", "athlete_id", "player", "position")) %>%
filter(if_all(everything(), is.na))
0 observations missing all, so less concerned.
senior_stats <- college_stats %>%
group_by(athlete_id) %>%
filter(year == max(year, na.rm = TRUE)) %>%
ungroup() %>%
left_join(num_schools, by = "athlete_id")
senior_stats <- senior_stats %>%
mutate(position = ifelse(position %in% c("?", "ATH"), NA, position),
side_of_ball = case_when(position %in% c("WR", "TE", "RB", "QB", "FB",
"OL", "C", "OT", "G") ~ "Off",
position %in% c("DB", "CB", "S", "LB", "DL",
"DE", "ILB", "OLB", "DT",
"NT", "EDGE") ~ "Def",
position %in% c("PK", "P", "LS", "PR") ~ "ST",
TRUE ~ NA),
position_group = case_when(position %in% c("WR") ~ "WR",
position %in% c("DB", "CB", "S") ~ "DB",
position %in% c("PK", "P", "LS", "PR") ~ "ST",
position %in% c("TE") ~ "TE",
position %in% c("RB", "FB") ~ "RB",
position %in% c("LB", "ILB", "OLB") ~ "LB",
position %in% c("DL", "DE", "DT", "NT",
"EDGE") ~ "DL",
position %in% c("QB") ~ "QB",
position %in% c("OL", "C", "OT", "G") ~ "OL",
TRUE ~ NA))
check_dups <- senior_stats %>% # 7 players show up twice in senior_stats
count(athlete_id) %>%
filter(n > 1) %>%
select(athlete_id)
check_dups
## # A tibble: 7 × 1
## athlete_id
## <dbl>
## 1 4037217
## 2 4243817
## 3 4362128
## 4 4430675
## 5 4431348
## 6 4570184
## 7 4576147
senior_stats %>% filter(athlete_id %in% check_dups$athlete_id)
## # A tibble: 14 × 63
## year team conference athlete_id player position passing_completions
## <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl>
## 1 2021 Old Dominion Conferenc… 4037217 Brand… TE NA
## 2 2021 Wake Forest ACC 4037217 Brand… TE NA
## 3 2021 Florida Atla… Conferenc… 4243817 Diash… CB NA
## 4 2021 UTEP Conferenc… 4243817 Diash… CB NA
## 5 2021 UCF American … 4362128 Justi… LB NA
## 6 2021 UConn FBS Indep… 4362128 Justi… LB NA
## 7 2021 Oklahoma Big 12 4430675 Andre… RB NA
## 8 2021 Oklahoma Sta… Big 12 4430675 Andre… RB NA
## 9 2021 Arizona Pac-12 4431348 Malik… LB NA
## 10 2021 Colorado Pac-12 4431348 Malik… LB NA
## 11 2021 Fresno State Mountain … 4570184 Evan … PK NA
## 12 2021 UNLV Mountain … 4570184 Evan … PK NA
## 13 2021 Old Dominion Conferenc… 4576147 Colt … OL NA
## 14 2021 Western Kent… Conferenc… 4576147 Colt … OL NA
## # ℹ 56 more variables: passing_att <dbl>, passing_pct <dbl>, passing_yds <dbl>,
## # passing_td <dbl>, passing_int <dbl>, passing_ypa <dbl>, rushing_car <dbl>,
## # rushing_yds <dbl>, rushing_td <dbl>, rushing_ypc <dbl>, rushing_long <dbl>,
## # receiving_rec <dbl>, receiving_yds <dbl>, receiving_td <dbl>,
## # receiving_ypr <dbl>, receiving_long <dbl>, fumbles_fum <dbl>,
## # fumbles_rec <dbl>, fumbles_lost <dbl>, defensive_solo <dbl>,
## # defensive_tot <dbl>, defensive_tfl <dbl>, defensive_sacks <dbl>, …
## these players all marked as playing for an opponent in >= 1 game
senior_stats <- senior_stats[!(senior_stats$athlete_id == "4037217" &
senior_stats$team == "Old Dominion"), ]
senior_stats <- senior_stats[!(senior_stats$athlete_id == "4243817" &
senior_stats$team == "UTEP"), ]
senior_stats <- senior_stats[!(senior_stats$athlete_id == "4362128" &
senior_stats$team == "UCF"), ]
senior_stats <- senior_stats[!(senior_stats$athlete_id == "4430675" &
senior_stats$team == "Oklahoma"), ]
senior_stats <- senior_stats[!(senior_stats$athlete_id == "4431348" &
senior_stats$team == "Colorado"), ]
senior_stats <- senior_stats[!(senior_stats$athlete_id == "4570184" &
senior_stats$team == "Fresno State"), ]
senior_stats <- senior_stats[!(senior_stats$athlete_id == "4576147" &
senior_stats$team == "Old Dominion"), ]
senior_stats <- senior_stats[!(senior_stats$athlete_id == "4680602" &
senior_stats$team == "Kentucky"), ]
check_dups <- senior_stats %>% # 7 players show up twice in senior_stats
count(athlete_id) %>%
filter(n > 1) %>%
select(athlete_id)
check_dups
## # A tibble: 0 × 1
## # ℹ 1 variable: athlete_id <dbl>
senior_stats <- senior_stats %>%
filter(position != "PR")
def_stats <- c("fumbles_rec", "defensive_solo", "defensive_tot", "defensive_tfl",
"defensive_sacks", "defensive_qb_hur", "interceptions_int",
"interceptions_yds", "interceptions_avg", "interceptions_td",
"defensive_pd", "defensive_td")
qb_stats <- c("passing_completions", "passing_att", "passing_pct", "passing_yds",
"passing_td", "passing_int", "passing_ypa", "rushing_car",
"rushing_yds", "rushing_td", "rushing_ypc", "rushing_long",
"fumbles_fum", "fumbles_rec", "fumbles_lost")
wr_rb_stats <- c("rushing_car", "rushing_yds", "rushing_td", "rushing_ypc",
"rushing_long", "receiving_rec", "receiving_yds", "receiving_td",
"receiving_ypr", "receiving_long", "fumbles_fum", "fumbles_rec",
"fumbles_lost", "punt_returns_no", "punt_returns_yds",
"punt_returns_avg", "punt_returns_td", "punt_returns_long")
st_stats <- c("kicking_fgm", "kicking_fga", "kicking_pct", "kicking_xpa",
"kicking_xpm", "kicking_pts", "kicking_long", "punting_no",
"punting_yds", "punting_ypp", "punting_long", "punting_in_20",
"punting_tb")
def_stats_df <- senior_stats %>%
filter(side_of_ball == "Def") %>%
select(year, team, conference, athlete_id, player, position, position_group,
all_of(def_stats))
qb_stats_df <- senior_stats %>%
filter(position == "QB") %>%
select(year, team, conference, athlete_id, player, position, position_group,
all_of(qb_stats))
wr_rb_stats_df <- senior_stats %>%
filter(position %in% c("WR", "TE", "RB", "FB")) %>%
select(year, team, conference, athlete_id, player, position, position_group,
all_of(wr_rb_stats))
st_stats_df <- senior_stats %>%
filter(side_of_ball == "ST") %>%
select(year, team, conference, athlete_id, player, position, position_group,
all_of(st_stats))
get_mode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
def_stats_avg <- def_stats_df %>%
group_by(position) %>%
select(-year, -team, -conference, -athlete_id, -player, -position_group) %>%
summarise(across(everything(), ~mean(.x, na.rm = TRUE)))
qb_stats_avg <- qb_stats_df %>%
group_by(position) %>%
select(-year, -team, -conference, -athlete_id, -player, -position_group) %>%
summarise(across(everything(), ~mean(.x, na.rm = TRUE)))
wr_rb_stats_avg <- wr_rb_stats_df %>%
group_by(position) %>%
select(-year, -team, -conference, -athlete_id, -player, -position_group) %>%
summarise(across(everything(), ~mean(.x, na.rm = TRUE)))
st_stats_avg <- st_stats_df %>%
group_by(position) %>%
select(-year, -team, -conference, -athlete_id, -player, -position_group) %>%
summarise(across(everything(), ~mean(.x, na.rm = TRUE)))
st_flags <- st_stats_df %>%
left_join(st_stats_avg, by = "position", suffix = c("", "_pos_avg")) %>%
mutate(across(
all_of(st_stats),
~ case_when(
.x > get(paste0(cur_column(), "_pos_avg")) ~ "above",
.x < get(paste0(cur_column(), "_pos_avg")) ~ "below",
TRUE ~ "at"
),
.names = "{.col}_flag"
)) %>%
select(athlete_id, position, ends_with("_flag")) %>%
rowwise() %>%
mutate(pos_skill_flag = get_mode(c_across(ends_with("_flag")))) %>%
ungroup()
wr_rb_flags <- wr_rb_stats_df %>%
left_join(wr_rb_stats_avg, by = "position", suffix = c("", "_pos_avg")) %>%
mutate(across(
all_of(wr_rb_stats),
~ case_when(
.x > get(paste0(cur_column(), "_pos_avg")) ~ "above",
.x < get(paste0(cur_column(), "_pos_avg")) ~ "below",
TRUE ~ "at"
),
.names = "{.col}_flag"
)) %>%
select(athlete_id, position, ends_with("_flag")) %>%
rowwise() %>%
mutate(pos_skill_flag = get_mode(c_across(ends_with("_flag")))) %>%
ungroup()
def_flags <- def_stats_df %>%
left_join(def_stats_avg, by = "position", suffix = c("", "_pos_avg")) %>%
mutate(across(
all_of(def_stats),
~ case_when(
.x > get(paste0(cur_column(), "_pos_avg")) ~ "above",
.x < get(paste0(cur_column(), "_pos_avg")) ~ "below",
TRUE ~ "at"
),
.names = "{.col}_flag"
)) %>%
select(athlete_id, position, ends_with("_flag")) %>%
rowwise() %>%
mutate(pos_skill_flag = get_mode(c_across(ends_with("_flag")))) %>%
ungroup()
qb_flags <- qb_stats_df %>%
left_join(qb_stats_avg, by = "position", suffix = c("", "_pos_avg")) %>%
mutate(across(
all_of(qb_stats),
~ case_when(
.x > get(paste0(cur_column(), "_pos_avg")) ~ "above",
.x < get(paste0(cur_column(), "_pos_avg")) ~ "below",
TRUE ~ "at"
),
.names = "{.col}_flag"
)) %>%
select(athlete_id, position, ends_with("_flag")) %>%
rowwise() %>%
mutate(pos_skill_flag = get_mode(c_across(ends_with("_flag")))) %>%
ungroup()
senior_stats <- senior_stats %>%
left_join(qb_flags %>% select(athlete_id,
pos_skill_flag_qb = pos_skill_flag),
by = "athlete_id") %>%
left_join(wr_rb_flags %>% select(athlete_id,
pos_skill_flag_wr_rb = pos_skill_flag),
by = "athlete_id") %>%
left_join(def_flags %>% select(athlete_id,
pos_skill_flag_def = pos_skill_flag),
by = "athlete_id") %>%
left_join(st_flags %>% select(athlete_id,
pos_skill_flag_st = pos_skill_flag),
by = "athlete_id") %>%
mutate(pos_skill_flag = coalesce(pos_skill_flag_qb, pos_skill_flag_wr_rb,
pos_skill_flag_def, pos_skill_flag_st)) %>%
select(-pos_skill_flag_qb, -pos_skill_flag_wr_rb, -pos_skill_flag_def,
-pos_skill_flag_st)
d3_conf <- teams %>%
filter(classification == "iii") %>%
distinct(conference) %>%
pull(conference)
d2_conf <- teams %>%
filter(classification == "ii") %>%
distinct(conference) %>%
pull(conference)
fcs_conf <- teams %>%
filter(classification == "fcs") %>%
distinct(conference) %>%
pull(conference)
fbs_conf <- teams %>%
filter(classification == "fbs") %>%
distinct(conference) %>%
pull(conference)
senior_stats <- senior_stats %>%
mutate(div_level = case_when(
conference %in% fbs_conf ~ "D1 FBS",
conference %in% fcs_conf ~ "D1 FCS",
conference %in% d2_conf ~ "D2",
conference %in% d3_conf ~ "D3",
TRUE ~ NA),
comp_level = case_when(
conference %in% c("Pac-12", "Big 12", "Big Ten", "ACC", "SEC") ~ "P5",
team == "Notre Dame" ~ "P5",
conference %in% c("American Athletic", "Conference USA", "Mid-American",
"Mountain West", "Sun Belt") ~ "G5",
team %in% c("Army", "BYU", "Massachusetts", "Liberty", "New Mexico State",
"UConn", "Navy") ~ "G5",
conference %in% c("MVFC", "CAA", "Big Sky") ~ "FCS Power",
conference %in% c("UAC", "SWAC", "Ivy", "Patriot", "Pioneer", "NEC",
"Big South-OVC", "Southern", "MEAC", "Southland",
"FCS Independents", "Big South", "OVC", "AWC",
"Western Athletic", "Atlantic Sun") ~ "FCS",
div_level == "D2" ~ "D2",
div_level == "D3" ~ "D3",
TRUE ~ NA))
analytic_v1 <- senior_stats %>%
select(year, team, conference, athlete_id, player, position, num_schools,
side_of_ball, position_group, div_level, comp_level, pos_skill_flag)
missing_flag <- analytic_v1 %>%
filter(is.na(pos_skill_flag)) ## all OL so will drop
analytic_v1 <- analytic_v1 %>%
filter(position_group != "OL")
gg_miss_var(analytic_v1)
write_csv(analytic_v1, "data processing/cfb_analytic_v1.csv")
rookies <- rosters %>%
filter(season == rookie_year, game_type == "REG", status == "ACT",
!(position %in% c("LS", "T", "G", "C", "OL"))) %>%
select(season, team, position, status, full_name, first_name, last_name,
birth_date, height, weight, gsis_id, espn_id, week, draft_club,
draft_number)
missing_ids <- rookies %>%
filter(is.na(espn_id)) %>%
distinct(full_name, season, team)
## some NA IDs we have college info for, some we don't
weeks_active <- rookies %>%
group_by(gsis_id, espn_id, team) %>%
summarize(weeks_active_team = n(), .groups = "drop") %>%
group_by(gsis_id, espn_id) %>%
arrange(desc(weeks_active_team), .by_group = TRUE) %>%
mutate(weeks_active = sum(weeks_active_team),
num_teams = n_distinct(team),
primary_team = first(team),
multiple_teams = ifelse(num_teams > 1, 1, 0)) %>%
slice_head() %>%
ungroup() %>%
select(gsis_id, espn_id, primary_team, weeks_active, multiple_teams)
rookie_success_v1 <- rookies %>%
distinct(gsis_id) %>%
left_join(weeks_active, by = "gsis_id") %>%
mutate(success_v1 = ifelse(weeks_active > 8, 1, 0))
rookie_success_v1 <- rookie_success_v1 %>%
left_join(rookies %>% select(gsis_id, season, full_name, position),
by = "gsis_id") %>%
mutate(name_id = tolower(full_name)) %>%
distinct(gsis_id, weeks_active, .keep_all = TRUE) %>%
rename(primary_roster_team = primary_team)
missing_ids <- rookie_success_v1 %>%
filter(is.na(espn_id))
draft_combine <- draft_combine %>%
select(season, draft_year, draft_team, draft_round, draft_ovr, pfr_id,
player_name, ht, wt, forty, bench, vertical, broad_jump, cone,
shuttle)
ids <- draft_nfl %>%
select(gsis_id, pfr_player_id, round, pick, age, dr_av, team)
full_draft <- ids %>%
mutate(gsis_id = ifelse(pfr_player_id == "YounBy01", "00-0039137", gsis_id))
draft_combine_rookie_success_v1 <- rookie_success_v1 %>%
left_join(full_draft, by = c("gsis_id")) %>%
filter(!is.na(gsis_id))
full_draft %>%
group_by(gsis_id) %>%
count(gsis_id) %>%
filter(n > 1)
## # A tibble: 1 × 2
## # Groups: gsis_id [1]
## gsis_id n
## <chr> <int>
## 1 <NA> 573
draft_combine_rookie_success_v1 %>%
group_by(gsis_id) %>%
count(gsis_id) %>%
filter(n > 1)
## # A tibble: 0 × 2
## # Groups: gsis_id [0]
## # ℹ 2 variables: gsis_id <chr>, n <int>
draft_combine_rookie_success_v1 <- draft_combine_rookie_success_v1 %>%
rename(draft_round = round,
draft_pick = pick,
draft_team = team)
draft_combine_rookie_success_v1 <- draft_combine_rookie_success_v1 %>%
mutate(
primary_roster_team = case_when(
primary_roster_team == "OAK" ~ "LV",
primary_roster_team == "SD" ~ "LAC",
primary_roster_team == "BLT" ~ "BAL",
primary_roster_team == "SL" ~ "LA",
primary_roster_team == "HST" ~ "HOU",
primary_roster_team == "CLV" ~ "CLE",
primary_roster_team == "ARZ" ~ "ARI",
TRUE ~ primary_roster_team))
write_csv(draft_combine_rookie_success_v1,
"data processing/nfl_analytic_v1.csv")
college <- analytic_v1
nfl <- draft_combine_rookie_success_v1
rm(list = setdiff(ls(), c("college", "nfl")))
team_schedules <- read_csv("raw data/load_schedules.csv")
home_results <- team_schedules %>%
filter(game_type == "REG") %>%
mutate(
result = case_when(
home_score > away_score ~ "W",
home_score < away_score ~ "L",
home_score == away_score ~ "T"
),
team = home_team,
opponent = away_team
) %>%
select(season, week, team, opponent, result, home_score, away_score)
away_results <- team_schedules %>%
filter(game_type == "REG") %>%
mutate(
result = case_when(
away_score > home_score ~ "W",
away_score < home_score ~ "L",
away_score == home_score ~ "T"
),
team = away_team,
opponent = home_team
) %>%
select(season, week, team, opponent, result, away_score, home_score)
count(home_results, result)
## # A tibble: 3 × 2
## result n
## <chr> <int>
## 1 L 1295
## 2 T 10
## 3 W 1574
count(away_results, result)
## # A tibble: 3 × 2
## result n
## <chr> <int>
## 1 L 1574
## 2 T 10
## 3 W 1295
home_team_records <- home_results %>%
group_by(season, team) %>%
summarise(
wins = sum(result == "W"),
losses = sum(result == "L"),
ties = sum(result == "T")
) %>%
ungroup()
## `summarise()` has grouped output by 'season'. You can override using the
## `.groups` argument.
away_team_records <- away_results %>%
group_by(season, team) %>%
summarise(
wins = sum(result == "W"),
losses = sum(result == "L"),
ties = sum(result == "T")
) %>%
ungroup()
## `summarise()` has grouped output by 'season'. You can override using the
## `.groups` argument.
team_records <- home_team_records %>%
full_join(away_team_records, by = c("season", "team"),
suffix = c("_home", "_away")) %>%
mutate(
wins = coalesce(wins_home, 0) + coalesce(wins_away, 0),
losses = coalesce(losses_home, 0) + coalesce(losses_away, 0),
ties = coalesce(ties_home, 0) + coalesce(ties_away, 0)
) %>%
select(season, team, wins, losses, ties)
team_records %>%
mutate(total_games = wins + losses + ties) %>%
filter(total_games != 16 & season %in% c(2015:2020))
## # A tibble: 0 × 6
## # ℹ 6 variables: season <dbl>, team <chr>, wins <dbl>, losses <dbl>,
## # ties <dbl>, total_games <dbl>
team_records %>%
mutate(total_games = wins + losses + ties) %>%
filter(total_games != 17 & season %in% c(2021:2024))
## # A tibble: 2 × 6
## season team wins losses ties total_games
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2022 BUF 13 3 0 16
## 2 2022 CIN 12 4 0 16
BUF @ CIN game was cancelled in 2022 so 16 is valid for them
team_wp <- team_records %>%
mutate(win_percentage = wins / (wins + losses + ties),
team = case_when(
team == "OAK" ~ "LV",
team == "SD" ~ "LAC",
team == "STL" ~ "LA",
TRUE ~ team
))
write_csv(team_wp, "data processing/nfl_team_win_percentage.csv")
head(college) %>% print(width = Inf)
## # A tibble: 6 × 12
## year team conference athlete_id player position
## <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 2014 Air Force Mountain West 109907 Jon Lee RB
## 2 2014 Akron Mid-American 147386 Josh Smith WR
## 3 2014 Norfolk State MEAC 2026567 Damian Smith WR
## 4 2014 Nebraska Big Ten 3116088 Byerson Cockrell S
## 5 2014 West Virginia Big 12 3116459 Dravon Henry S
## 6 2014 Louisville ACC 3116652 Chris Miele WR
## num_schools side_of_ball position_group div_level comp_level pos_skill_flag
## <int> <chr> <chr> <chr> <chr> <chr>
## 1 1 Off RB D1 FBS G5 below
## 2 1 Off WR D1 FBS G5 at
## 3 1 Off WR D1 FCS FCS at
## 4 1 Def DB D1 FBS P5 at
## 5 1 Def DB D1 FBS P5 at
## 6 1 Off WR D1 FBS P5 at
str(college)
## tibble [47,428 × 12] (S3: tbl_df/tbl/data.frame)
## $ year : num [1:47428] 2014 2014 2014 2014 2014 ...
## $ team : chr [1:47428] "Air Force" "Akron" "Norfolk State" "Nebraska" ...
## $ conference : chr [1:47428] "Mountain West" "Mid-American" "MEAC" "Big Ten" ...
## $ athlete_id : num [1:47428] 109907 147386 2026567 3116088 3116459 ...
## $ player : chr [1:47428] "Jon Lee" "Josh Smith" "Damian Smith" "Byerson Cockrell" ...
## $ position : chr [1:47428] "RB" "WR" "WR" "S" ...
## $ num_schools : int [1:47428] 1 1 1 1 1 1 1 1 1 1 ...
## $ side_of_ball : chr [1:47428] "Off" "Off" "Off" "Def" ...
## $ position_group: chr [1:47428] "RB" "WR" "WR" "DB" ...
## $ div_level : chr [1:47428] "D1 FBS" "D1 FBS" "D1 FCS" "D1 FBS" ...
## $ comp_level : chr [1:47428] "G5" "G5" "FCS" "P5" ...
## $ pos_skill_flag: chr [1:47428] "below" "at" "at" "at" ...
skim(college)
| Name | college |
| Number of rows | 47428 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| team | 0 | 1 | 3 | 26 | 0 | 306 | 0 |
| conference | 0 | 1 | 3 | 27 | 0 | 44 | 0 |
| player | 0 | 1 | 5 | 31 | 0 | 45287 | 0 |
| position | 0 | 1 | 1 | 4 | 0 | 19 | 0 |
| side_of_ball | 0 | 1 | 2 | 3 | 0 | 3 | 0 |
| position_group | 0 | 1 | 2 | 2 | 0 | 8 | 0 |
| div_level | 0 | 1 | 2 | 6 | 0 | 4 | 0 |
| comp_level | 0 | 1 | 2 | 9 | 0 | 6 | 0 |
| pos_skill_flag | 0 | 1 | 2 | 5 | 0 | 3 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| year | 0 | 1 | 2020.51 | 3.23 | 2014 | 2018 | 2021 | 2024 | 2024 | ▃▃▂▃▇ |
| athlete_id | 0 | 1 | 3895089.96 | 1433266.37 | 13223 | 3917911 | 4367020 | 4713129 | 5240138 | ▂▁▁▂▇ |
| num_schools | 0 | 1 | 1.17 | 0.41 | 1 | 1 | 1 | 1 | 4 | ▇▁▁▁▁ |
head(nfl) %>% print(width = Inf)
## # A tibble: 6 × 16
## gsis_id espn_id primary_roster_team weeks_active multiple_teams success_v1
## <chr> <dbl> <chr> <int> <dbl> <dbl>
## 1 00-0031474 NA NO 17 0 1
## 2 00-0031480 NA NYG 17 0 1
## 3 00-0031491 2265764 HOU 14 0 1
## 4 00-0031492 2473037 JAX 17 0 1
## 5 00-0031493 3423412 SF 9 0 1
## 6 00-0031502 2472364 PIT 17 0 1
## season full_name position name_id pfr_player_id draft_round
## <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 2015 Delvin Breaux CB delvin breaux <NA> NA
## 2 2015 Uani 'Unga MLB uani 'unga <NA> NA
## 3 2015 Brian Peters LB brian peters <NA> NA
## 4 2015 Jason Myers K jason myers <NA> NA
## 5 2015 Jarryd Hayne RB jarryd hayne <NA> NA
## 6 2015 Jordan Berry P jordan berry <NA> NA
## draft_pick age dr_av draft_team
## <dbl> <dbl> <dbl> <chr>
## 1 NA NA NA <NA>
## 2 NA NA NA <NA>
## 3 NA NA NA <NA>
## 4 NA NA NA <NA>
## 5 NA NA NA <NA>
## 6 NA NA NA <NA>
str(nfl)
## tibble [3,394 × 16] (S3: tbl_df/tbl/data.frame)
## $ gsis_id : chr [1:3394] "00-0031474" "00-0031480" "00-0031491" "00-0031492" ...
## $ espn_id : num [1:3394] NA NA 2265764 2473037 3423412 ...
## $ primary_roster_team: chr [1:3394] "NO" "NYG" "HOU" "JAX" ...
## $ weeks_active : int [1:3394] 17 17 14 17 9 17 17 7 17 2 ...
## $ multiple_teams : num [1:3394] 0 0 0 0 0 0 0 0 0 0 ...
## $ success_v1 : num [1:3394] 1 1 1 1 1 1 1 0 1 0 ...
## $ season : num [1:3394] 2015 2015 2015 2015 2015 ...
## $ full_name : chr [1:3394] "Delvin Breaux" "Uani 'Unga" "Brian Peters" "Jason Myers" ...
## $ position : chr [1:3394] "CB" "MLB" "LB" "K" ...
## $ name_id : chr [1:3394] "delvin breaux" "uani 'unga" "brian peters" "jason myers" ...
## $ pfr_player_id : chr [1:3394] NA NA NA NA ...
## $ draft_round : num [1:3394] NA NA NA NA NA NA 1 NA NA NA ...
## $ draft_pick : num [1:3394] NA NA NA NA NA NA 1 NA NA NA ...
## $ age : num [1:3394] NA NA NA NA NA NA 21 NA NA NA ...
## $ dr_av : num [1:3394] NA NA NA NA NA NA 54 NA NA NA ...
## $ draft_team : chr [1:3394] NA NA NA NA ...
skim(nfl)
| Name | nfl |
| Number of rows | 3394 |
| Number of columns | 16 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 9 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| gsis_id | 0 | 1.00 | 10 | 10 | 0 | 3394 | 0 |
| primary_roster_team | 0 | 1.00 | 2 | 3 | 0 | 32 | 0 |
| full_name | 0 | 1.00 | 6 | 24 | 0 | 3380 | 0 |
| position | 0 | 1.00 | 1 | 3 | 0 | 20 | 0 |
| name_id | 0 | 1.00 | 6 | 24 | 0 | 3380 | 0 |
| pfr_player_id | 1381 | 0.59 | 8 | 8 | 0 | 2013 | 0 |
| draft_team | 1381 | 0.59 | 3 | 3 | 0 | 35 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| espn_id | 923 | 0.73 | 3674771.05 | 705438.72 | 2265764 | 3047504 | 3915511 | 4360252 | 5278092 | ▂▇▅▆▂ |
| weeks_active | 0 | 1.00 | 10.63 | 5.75 | 1 | 5 | 13 | 16 | 17 | ▃▂▂▂▇ |
| multiple_teams | 0 | 1.00 | 0.02 | 0.15 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| success_v1 | 0 | 1.00 | 0.64 | 0.48 | 0 | 0 | 1 | 1 | 1 | ▅▁▁▁▇ |
| season | 0 | 1.00 | 2019.86 | 3.12 | 2015 | 2017 | 2020 | 2023 | 2025 | ▇▅▅▅▅ |
| draft_round | 1381 | 0.59 | 3.93 | 1.91 | 1 | 2 | 4 | 6 | 7 | ▇▅▅▅▇ |
| draft_pick | 1381 | 0.59 | 120.80 | 71.77 | 1 | 59 | 118 | 179 | 262 | ▇▇▇▇▅ |
| age | 1381 | 0.59 | 22.32 | 0.96 | 20 | 22 | 22 | 23 | 27 | ▃▇▇▁▁ |
| dr_av | 1460 | 0.57 | 9.45 | 13.15 | -1 | 1 | 5 | 13 | 106 | ▇▁▁▁▁ |
nfl <- nfl %>%
mutate(drafted = ifelse(is.na(draft_round), 0, 1))
college %>% count(year) %>% arrange(desc(n))
## # A tibble: 11 × 2
## year n
## <dbl> <int>
## 1 2024 13562
## 2 2023 4926
## 3 2022 4890
## 4 2019 4094
## 5 2018 3986
## 6 2017 3862
## 7 2016 3419
## 8 2021 3419
## 9 2014 1952
## 10 2015 1770
## 11 2020 1548
college %>% count(team) %>% arrange(desc(n))
## # A tibble: 306 × 2
## team n
## <chr> <int>
## 1 Air Force 303
## 2 Army 285
## 3 Akron 281
## 4 BYU 278
## 5 Texas State 274
## 6 UTEP 270
## 7 UTSA 267
## 8 Navy 266
## 9 Louisiana Tech 263
## 10 Southern Miss 261
## # ℹ 296 more rows
college %>% count(conference) %>% arrange(desc(n))
## # A tibble: 44 × 2
## conference n
## <chr> <int>
## 1 Big Ten 3291
## 2 SEC 3221
## 3 ACC 3130
## 4 Conference USA 3053
## 5 American Athletic 3006
## 6 Sun Belt 2968
## 7 Mountain West 2893
## 8 Mid-American 2821
## 9 Big 12 2692
## 10 Pac-12 2001
## # ℹ 34 more rows
college %>% count(position) %>% arrange(desc(n))
## # A tibble: 19 × 2
## position n
## <chr> <int>
## 1 WR 8348
## 2 LB 6622
## 3 DB 5652
## 4 RB 5122
## 5 DL 4833
## 6 TE 2900
## 7 QB 2833
## 8 S 2472
## 9 CB 2093
## 10 PK 1695
## 11 DE 1592
## 12 DT 1090
## 13 P 969
## 14 LS 567
## 15 FB 302
## 16 OLB 106
## 17 ILB 91
## 18 NT 74
## 19 EDGE 67
college %>% count(position_group) %>% arrange(desc(n))
## # A tibble: 8 × 2
## position_group n
## <chr> <int>
## 1 DB 10217
## 2 WR 8348
## 3 DL 7656
## 4 LB 6819
## 5 RB 5424
## 6 ST 3231
## 7 TE 2900
## 8 QB 2833
college %>% count(num_schools) %>% arrange(desc(n))
## # A tibble: 4 × 2
## num_schools n
## <int> <int>
## 1 1 40208
## 2 2 6556
## 3 3 643
## 4 4 21
college %>% count(side_of_ball) %>% arrange(desc(n))
## # A tibble: 3 × 2
## side_of_ball n
## <chr> <int>
## 1 Def 24692
## 2 Off 19505
## 3 ST 3231
college %>% count(div_level) %>% arrange(desc(n))
## # A tibble: 4 × 2
## div_level n
## <chr> <int>
## 1 D1 FBS 30256
## 2 D1 FCS 17058
## 3 D2 112
## 4 D3 2
college %>% count(comp_level) %>% arrange(desc(n))
## # A tibble: 6 × 2
## comp_level n
## <chr> <int>
## 1 G5 15732
## 2 P5 14550
## 3 FCS 11707
## 4 FCS Power 5325
## 5 D2 112
## 6 D3 2
college %>% count(pos_skill_flag) %>% arrange(desc(n))
## # A tibble: 3 × 2
## pos_skill_flag n
## <chr> <int>
## 1 below 20888
## 2 at 20226
## 3 above 6314
nfl %>% count(success_v1) %>% arrange(desc(n))
## # A tibble: 2 × 2
## success_v1 n
## <dbl> <int>
## 1 1 2165
## 2 0 1229
nfl %>% count(primary_roster_team) %>% arrange(desc(n))
## # A tibble: 32 × 2
## primary_roster_team n
## <chr> <int>
## 1 CLE 132
## 2 GB 127
## 3 LA 125
## 4 LAC 121
## 5 IND 120
## 6 LV 118
## 7 NYJ 117
## 8 JAX 115
## 9 NYG 114
## 10 WAS 112
## # ℹ 22 more rows
nfl %>% count(weeks_active) %>% arrange(desc(n))
## # A tibble: 17 × 2
## weeks_active n
## <int> <int>
## 1 16 801
## 2 17 396
## 3 1 316
## 4 14 232
## 5 2 188
## 6 3 150
## 7 15 149
## 8 13 142
## 9 4 132
## 10 7 122
## 11 5 117
## 12 12 114
## 13 9 112
## 14 10 112
## 15 11 107
## 16 6 102
## 17 8 102
nfl %>% count(multiple_teams) %>% arrange(desc(n))
## # A tibble: 2 × 2
## multiple_teams n
## <dbl> <int>
## 1 0 3313
## 2 1 81
nfl %>% count(season) %>% arrange(desc(n))
## # A tibble: 11 × 2
## season n
## <dbl> <int>
## 1 2016 423
## 2 2018 328
## 3 2022 328
## 4 2017 324
## 5 2020 319
## 6 2019 299
## 7 2023 297
## 8 2024 283
## 9 2025 277
## 10 2021 276
## 11 2015 240
nfl %>% count(position) %>% arrange(desc(n))
## # A tibble: 20 × 2
## position n
## <chr> <int>
## 1 DB 798
## 2 LB 576
## 3 WR 548
## 4 DL 505
## 5 RB 374
## 6 TE 238
## 7 QB 142
## 8 K 52
## 9 P 43
## 10 CB 32
## 11 DE 18
## 12 DT 14
## 13 OLB 14
## 14 ILB 9
## 15 NT 8
## 16 MLB 7
## 17 S 5
## 18 FS 4
## 19 SS 4
## 20 FB 3
nfl %>% count(draft_round) %>% arrange(desc(n))
## # A tibble: 8 × 2
## draft_round n
## <dbl> <int>
## 1 NA 1381
## 2 3 323
## 3 4 315
## 4 5 313
## 5 6 286
## 6 2 278
## 7 1 272
## 8 7 226
nfl %>% count(draft_pick) %>% arrange(desc(n))
## # A tibble: 262 × 2
## draft_pick n
## <dbl> <int>
## 1 NA 1381
## 2 1 11
## 3 2 11
## 4 30 11
## 5 40 11
## 6 47 11
## 7 52 11
## 8 60 11
## 9 113 11
## 10 165 11
## # ℹ 252 more rows
nfl %>% count(draft_team) %>% arrange(desc(n))
## # A tibble: 36 × 2
## draft_team n
## <chr> <int>
## 1 <NA> 1381
## 2 GNB 76
## 3 CLE 72
## 4 JAX 71
## 5 LAR 71
## 6 BAL 70
## 7 SFO 69
## 8 DAL 67
## 9 DEN 67
## 10 WAS 67
## # ℹ 26 more rows
nfl %>% count(drafted) %>% arrange(desc(n))
## # A tibble: 2 × 2
## drafted n
## <dbl> <int>
## 1 1 2013
## 2 0 1381
nfl %>% count(age) %>% arrange(desc(n))
## # A tibble: 9 × 2
## age n
## <dbl> <int>
## 1 NA 1381
## 2 22 760
## 3 23 650
## 4 21 397
## 5 24 167
## 6 25 24
## 7 20 11
## 8 26 3
## 9 27 1
nfl %>% count(dr_av) %>% arrange(desc(n))
## # A tibble: 76 × 2
## dr_av n
## <dbl> <int>
## 1 NA 1460
## 2 0 384
## 3 1 207
## 4 2 167
## 5 3 102
## 6 5 99
## 7 4 91
## 8 6 79
## 9 7 58
## 10 8 52
## # ℹ 66 more rows
gg_miss_var(college) +
labs(title = "Missing values in college data")
gg_miss_var(nfl) +
labs(title = "Missing values in nfl data")
Missing data in NFL makes sense if they were undrafted. Will need to
figure out way to handle missing espn_id because that’s a
lot
name_idcollege <- college %>%
filter(!(position %in% c("LS"))) %>%
mutate(name_id_college = player %>%
str_to_lower() %>%
str_replace_all("[^a-z]", ""),
name_season_id_college = paste0(name_id_college, "-", year),
first_name = tolower(str_extract(player, "^[^ ]+")),
last_name = tolower(str_extract(player, "[^ ]+$")),
athlete_id = as.character(athlete_id)) %>%
rename(position_college = position)
nfl <- nfl %>%
filter(!(position %in% c("G", "T", "C", "OL", "LS"))) %>%
mutate(college_year = season - 1,
name_id = full_name %>%
str_to_lower() %>%
str_replace_all("[^a-z]", ""),
name_season_id = paste0(name_id, "-", college_year),
espn_id = as.character(espn_id))
name_id counts & duplicatescollege_dups <- college %>%
count(name_id_college) %>%
filter(n > 1) %>%
left_join(college, by = "name_id_college")
nfl_dups <- nfl %>%
count(name_id) %>%
filter(n > 1) %>%
left_join(nfl, by = "name_id")
espn_idmerged_df_id <- nfl %>%
filter(!is.na(espn_id)) %>%
left_join(college %>% filter(!is.na(athlete_id)),
by = c("espn_id" = "athlete_id")) %>%
filter(!is.na(position_college)) ## espn_id only works for 1294 players
## check possible incorrect matches
merged_df_id %>%
filter(name_id != name_id_college) %>%
select(season, name_id, name_id_college, draft_team, team) %>%
arrange(season)
## # A tibble: 119 × 5
## season name_id name_id_college draft_team team
## <dbl> <chr> <chr> <chr> <chr>
## 1 2016 mikethomas michaelthomas LAR Southern Miss
## 2 2018 simmiecobbs simmiecobbsjr <NA> Indiana
## 3 2018 mikebadgley michaelbadgley <NA> Miami
## 4 2018 jeffwilson jefferywilson <NA> North Texas
## 5 2018 jakemartin jacobmartin SEA Temple
## 6 2018 olaadeniyi olasunkanmiadeniyi <NA> Toledo
## 7 2018 dorancearmstrong dorancearmstrongjr DAL Kansas
## 8 2018 kammoore kamrinmoore NOR Boston College
## 9 2018 buddyhowell gregoryhowelljr <NA> Florida Atlantic
## 10 2018 zeketurner ezekielturner <NA> Washington
## # ℹ 109 more rows
## all same player, just different version of name
nfl_dups_name_season <- nfl_dups %>%
count(name_season_id) %>%
filter(n > 1) %>%
left_join(nfl_dups %>% select(name_id, gsis_id, espn_id,
draft_team, name_season_id),
by = "name_season_id")
college_dups_name_season <- college_dups %>%
count(name_season_id_college) %>%
filter(n > 1) %>%
left_join(college_dups %>% select(name_id_college, athlete_id,
team, name_season_id_college),
by = "name_season_id_college")
merged_df_id <- nfl %>%
filter(!is.na(espn_id)) %>%
left_join(college %>% filter(!is.na(athlete_id)),
by = c("espn_id" = "athlete_id")) %>%
filter(!is.na(position_college)) %>%
mutate(match_type = "1 - ID match")
merged_df_name <- nfl %>%
filter(!(name_id %in% nfl_dups$name_id),
!(gsis_id %in% merged_df_id$gsis_id)) %>%
left_join(college %>% filter(!(name_id_college %in% college_dups$name_id_college)),
by = c("name_id" = "name_id_college")) %>%
filter(!is.na(position_college)) %>%
mutate(match_type = "2 - Name match")
merged_df_name_season <- nfl %>%
filter(!(gsis_id %in% merged_df_id$gsis_id),
!(gsis_id %in% merged_df_name$gsis_id),
!(name_season_id %in% college_dups_name_season$name_season_id_college),
!(name_season_id %in% nfl_dups_name_season$name_season_id)) %>%
left_join(college, by = c("name_season_id" = "name_season_id_college")) %>%
filter(!is.na(position_college)) %>%
mutate(match_type = "3 - Name-Season match")
check_matches <- nfl %>%
mutate(match_type = case_when(
gsis_id %in% merged_df_id$gsis_id ~ "1 - ID match",
gsis_id %in% merged_df_name$gsis_id ~ "2 - Name match",
gsis_id %in% merged_df_name_season$gsis_id ~ "3 - Name-Season match",
TRUE ~ "0 - Unmatched"
))
count(check_matches, match_type)
## # A tibble: 4 × 2
## match_type n
## <chr> <int>
## 1 0 - Unmatched 477
## 2 1 - ID match 1554
## 3 2 - Name match 1263
## 4 3 - Name-Season match 100
85.95% of data is matched - call that a success (for now).
check_matches %>%
select(match_type, success_v1) %>%
table() %>%
prop.table(margin = 1)
## success_v1
## match_type 0 1
## 0 - Unmatched 0.4109015 0.5890985
## 1 - ID match 0.3063063 0.6936937
## 2 - Name match 0.4117181 0.5882819
## 3 - Name-Season match 0.3700000 0.6300000
check_matches %>% select(success_v1) %>% table() %>% prop.table()
## success_v1
## 0 1
## 0.3621096 0.6378904
extra_cols <- Reduce(union, list(
setdiff(names(merged_df_id), names(merged_df_name)),
setdiff(names(merged_df_id), names(merged_df_name_season)),
setdiff(names(merged_df_name), names(merged_df_id)),
setdiff(names(merged_df_name), names(merged_df_name_season)),
setdiff(names(merged_df_name_season), names(merged_df_id)),
setdiff(names(merged_df_name_season), names(merged_df_name))
))
merged_df_id <- merged_df_id %>%
select(-any_of(extra_cols))
merged_df_name <- merged_df_name %>%
select(-any_of(extra_cols))
merged_df_name_season <- merged_df_name_season %>%
select(-any_of(extra_cols))
final_merged_df <- bind_rows(merged_df_id,
merged_df_name,
merged_df_name_season)
write_csv(final_merged_df, "data processing/combined_analytic.csv")
final_merged_df %>%
count(gsis_id) %>%
filter(n > 1)
## # A tibble: 0 × 2
## # ℹ 2 variables: gsis_id <chr>, n <int>
No duplicates!
rm(list = ls())
combined <- read_csv("data processing/combined_analytic.csv")
team_wp <- read_csv("data processing/nfl_team_win_percentage.csv")
combined <- combined %>%
select(-c("espn_id", "name_id", "pfr_player_id",
"name_season_id", "year", "player", "first_name", "last_name",
"match_type"))
gg_miss_var(combined) +
labs(title = "Missing values in analytic file data")
gg_miss_var(team_wp) +
labs(title = "Missing values in Team WP data")
draft_team to abbreviations to match
primary_roster_teamcombined <- combined %>%
mutate(draft_team = case_when(
draft_team == "GNB" ~ "GB",
draft_team == "DEN" ~ "DEN",
draft_team == "CAR" ~ "CAR",
draft_team == "BAL" ~ "BAL",
draft_team == "SFO" ~ "SF",
draft_team == "WAS" ~ "WAS",
draft_team == "HOU" ~ "HOU",
draft_team == "ATL" ~ "ATL",
draft_team == "PHI" ~ "PHI",
draft_team == "IND" ~ "IND",
draft_team == "DET" ~ "DET",
draft_team == "DAL" ~ "DAL",
draft_team == "SEA" ~ "SEA",
draft_team == "JAX" ~ "JAX",
draft_team == "PIT" ~ "PIT",
draft_team == "CLE" ~ "CLE",
draft_team == "KAN" ~ "KC",
draft_team == "NOR" ~ "NO",
draft_team == "TEN" ~ "TEN",
draft_team == "ARI" ~ "ARI",
draft_team == "MIN" ~ "MIN",
draft_team == "LAR" ~ "LA",
draft_team == "TAM" ~ "TB",
draft_team == "LVR" ~ "LV",
draft_team == "LAC" ~ "LAC",
draft_team == "NYG" ~ "NYG",
draft_team == "CHI" ~ "CHI",
draft_team == "BUF" ~ "BUF",
draft_team == "CIN" ~ "CIN",
draft_team == "NYJ" ~ "NYJ",
draft_team == "MIA" ~ "MIA",
draft_team == "NWE" ~ "NE",
draft_team == "SDG" ~ "LAC",
draft_team == "STL" ~ "LA",
draft_team == "OAK" ~ "LV",
TRUE ~ draft_team
))
combined <- combined %>%
mutate(draft_team = ifelse(is.na(draft_team), "UDFA", draft_team),
draft_round = ifelse(is.na(draft_round), "Undrafted", draft_round))
combined <- combined %>%
mutate(different_position = ifelse(position != position_college, 1, 0),
changed_teams = ifelse(primary_roster_team != draft_team, 1, 0))
combined <- combined %>%
mutate(draft_round_cat = case_when(
draft_round %in% c(1) ~ "Round 1",
draft_round %in% c(2,3) ~ "Rounds 2-3",
draft_round %in% c(4,5,6,7) ~ "Rounds 4-7",
TRUE ~ "Undrafted"
))
combined <- combined %>%
mutate(prev_season = season - 1) %>%
left_join(team_wp %>% select(team, season, win_percentage),
by = c("draft_team" = "team", "prev_season" = "season")) %>%
rename(draft_team_wp_prev = win_percentage) %>%
left_join(team_wp %>% select(team, season, win_percentage),
by = c("primary_roster_team" = "team", "prev_season" = "season")) %>%
rename(roster_team_wp_prev = win_percentage) %>%
select(-prev_season)
count(combined, is.na(roster_team_wp_prev))
## # A tibble: 1 × 2
## `is.na(roster_team_wp_prev)` n
## <lgl> <int>
## 1 FALSE 2917
All players have a primary roster team winning percentage so can use that without any concerns.
write_csv(combined, "data processing/full_data.csv")
rm(team_wp)
ggplot(combined, aes(x = num_schools)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(title = "Distribution of Number of Schools Attended",
x = "Number of Schools",
y = "Count of Players") +
theme_minimal()
ggplot(combined, aes(x = position_group)) +
geom_bar(stat = "count", fill = "skyblue", color = "black") +
labs(title = "Distribution of College Positions",
x = "Position Group",
y = "Count of Players") +
theme_minimal()
ggplot(combined, aes(x = as.factor(success_v1))) +
geom_bar(fill = "skyblue", color = "black") +
labs(title = "Distribution of Basic Success for Rookies",
x = "Active for more than half the season (1) or not (0)",
y = "Count of Players") +
theme_minimal()
numeric_vars <- combined %>%
select(where(is.numeric)) %>%
names()
for (var in numeric_vars) {
p <- ggplot(combined, aes_string(x = var)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(title = paste("Distribution of", var),
x = var,
y = "Count of Players") +
theme_minimal()
print(p)
}
combined_for_analysis <- combined %>%
filter(season != 2025) %>%
select(-c("gsis_id", "full_name")) %>%
mutate(primary_roster_team = as.factor(primary_roster_team),
multiple_teams = as.factor(multiple_teams),
success_v1 = as.factor(success_v1),
num_schools = as.factor(num_schools),
side_of_ball = as.factor(side_of_ball),
position_group = as.factor(position_group),
conference = as.factor(conference),
div_level = as.factor(div_level) %>% relevel("D1 FBS"),
comp_level = as.factor(comp_level) %>% relevel("P5"),
pos_skill_flag = as.factor(pos_skill_flag),
position = as.factor(position),
position_college = as.factor(position_college),
different_position = as.factor(different_position),
drafted = as.factor(drafted),
draft_team = as.factor(draft_team),
changed_teams = as.factor(changed_teams),
draft_round = as.factor(draft_round) %>% relevel("Undrafted"),
draft_round_cat = as.factor(draft_round_cat) %>% relevel("Undrafted")
)
bivariate_cols <- colnames(combined_for_analysis)
numeric_cols <- combined_for_analysis %>%
select(all_of(bivariate_cols)) %>%
select(where(is.numeric)) %>%
colnames()
cat_cols <- combined_for_analysis %>%
select(all_of(bivariate_cols)) %>%
select(!any_of(numeric_cols)) %>%
colnames()
combined_for_analysis %>%
group_by(season) %>%
summarize(success_rate = mean(success_v1 == 1),
n = n()) %>%
ggplot(aes(x = season, y = success_rate)) +
geom_point() +
geom_smooth() +
theme_classic()
combined_for_analysis %>%
group_by(college_year) %>%
summarize(success_rate = mean(success_v1 == 1),
n = n()) %>%
ggplot(aes(x = college_year, y = success_rate)) +
geom_point() +
geom_smooth() +
theme_classic()
combined_for_analysis %>%
group_by(draft_pick) %>%
summarize(success_rate = mean(success_v1 == 1),
n = n()) %>%
ggplot(aes(x = draft_pick, y = success_rate)) +
geom_point() +
geom_smooth() +
theme_classic()
combined_for_analysis %>%
group_by(age) %>%
summarize(success_rate = mean(success_v1 == 1),
n = n()) %>%
ggplot(aes(x = age, y = success_rate)) +
geom_point() +
geom_smooth() +
theme_classic()
combined_for_analysis %>%
group_by(dr_av) %>%
summarize(success_rate = mean(success_v1 == 1),
n = n()) %>%
ggplot(aes(x = dr_av, y = success_rate)) +
geom_point() +
geom_smooth() +
theme_classic()
combined_for_analysis %>%
group_by(draft_team_wp_prev) %>%
summarize(success_rate = mean(success_v1 == 1),
n = n()) %>%
ggplot(aes(x = draft_team_wp_prev, y = success_rate)) +
geom_point() +
geom_smooth() +
theme_classic()
combined_for_analysis %>%
group_by(roster_team_wp_prev) %>%
summarize(success_rate = mean(success_v1 == 1),
n = n()) %>%
ggplot(aes(x = roster_team_wp_prev, y = success_rate)) +
geom_point() +
geom_smooth() +
theme_classic()
*Note: didn’t look at weeks_active vs
success_v1 because success_v1 is calculated
from weeks_active.
ggplot(combined_for_analysis, aes(x = primary_roster_team, fill = success_v1)) +
geom_bar(position = "fill") +
labs(y = "Proportion") +
theme_classic() +
guides(x = guide_axis(angle = 90))
ggplot(combined_for_analysis, aes(x = multiple_teams, fill = success_v1)) +
geom_bar(position = "fill") +
labs(y = "Proportion") +
theme_classic() +
guides(x = guide_axis(angle = 90))
ggplot(combined_for_analysis, aes(x = changed_teams, fill = success_v1)) +
geom_bar(position = "fill") +
labs(y = "Proportion") +
theme_classic() +
guides(x = guide_axis(angle = 90))
ggplot(combined_for_analysis, aes(x = different_position, fill = success_v1)) +
geom_bar(position = "fill") +
labs(y = "Proportion") +
theme_classic() +
guides(x = guide_axis(angle = 90))
ggplot(combined_for_analysis, aes(x = position, fill = success_v1)) +
geom_bar(position = "fill") +
labs(y = "Proportion") +
theme_classic() +
guides(x = guide_axis(angle = 90))
ggplot(combined_for_analysis, aes(x = position_college, fill = success_v1)) +
geom_bar(position = "fill") +
labs(y = "Proportion") +
theme_classic() +
guides(x = guide_axis(angle = 90))
ggplot(combined_for_analysis, aes(x = position_group, fill = success_v1)) +
geom_bar(position = "fill") +
labs(y = "Proportion") +
theme_classic() +
guides(x = guide_axis(angle = 90))
ggplot(combined_for_analysis, aes(x = pos_skill_flag, fill = success_v1)) +
geom_bar(position = "fill") +
labs(y = "Proportion") +
theme_classic() +
guides(x = guide_axis(angle = 90))
ggplot(combined_for_analysis, aes(x = draft_round, fill = success_v1)) +
geom_bar(position = "fill") +
labs(y = "Proportion") +
theme_classic() +
guides(x = guide_axis(angle = 90))
ggplot(combined_for_analysis, aes(x = draft_round_cat, fill = success_v1)) +
geom_bar(position = "fill") +
labs(y = "Proportion") +
theme_classic() +
guides(x = guide_axis(angle = 90))
ggplot(combined_for_analysis, aes(x = draft_team, fill = success_v1)) +
geom_bar(position = "fill") +
labs(y = "Proportion") +
theme_classic() +
guides(x = guide_axis(angle = 90))
ggplot(combined_for_analysis, aes(x = drafted, fill = success_v1)) +
geom_bar(position = "fill") +
labs(y = "Proportion") +
theme_classic() +
guides(x = guide_axis(angle = 90))
ggplot(combined_for_analysis, aes(x = team, fill = success_v1)) +
geom_bar(position = "fill") +
labs(y = "Proportion") +
theme_classic() +
guides(x = guide_axis(angle = 90))
ggplot(combined_for_analysis, aes(x = conference, fill = success_v1)) +
geom_bar(position = "fill") +
labs(y = "Proportion") +
theme_classic() +
guides(x = guide_axis(angle = 90))
ggplot(combined_for_analysis, aes(x = div_level, fill = success_v1)) +
geom_bar(position = "fill") +
labs(y = "Proportion") +
theme_classic() +
guides(x = guide_axis(angle = 90))
ggplot(combined_for_analysis, aes(x = comp_level, fill = success_v1)) +
geom_bar(position = "fill") +
labs(y = "Proportion") +
theme_classic() +
guides(x = guide_axis(angle = 90))
ggplot(combined_for_analysis, aes(x = num_schools, fill = success_v1)) +
geom_bar(position = "fill") +
labs(y = "Proportion") +
theme_classic() +
guides(x = guide_axis(angle = 90))
ggplot(combined_for_analysis, aes(x = weeks_active)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
geom_vline(xintercept = 8.5,
color = "darkblue", linetype = "dashed", linewidth = 1) +
labs(title = "Distribution of Weeks Active",
subtitle = "For NFL Rookies",
x = "Weeks on Active Roster",
y = "Count of Players") +
theme(plot.subtitle = element_text(size = 5)) +
theme_minimal()
prop_df <- combined_for_analysis %>%
count(drafted, success_v1) %>%
group_by(drafted) %>%
mutate(prop = n / sum(n),
prop_round = paste0(round(prop, 3) * 100, "%"))
ggplot(prop_df, aes(x = factor(drafted), y = prop, fill = factor(success_v1))) +
geom_col(position = "fill") +
geom_text(aes(label = ifelse(success_v1 == 1, prop_round, "")),
position = position_stack(vjust = 0.9),
color = "black",
size = 3.5
) +
scale_fill_manual(
name = "Successful?",
values = c("#D3D3D3", "#87CEEB"),
labels = c("No", "Yes")
) +
labs(
x = "Drafted (1 = Yes)",
y = "Proportion",
title = "NFL Rookie Draft Status and Success Rate") +
theme_minimal()
ggplot(combined_for_analysis, aes(x = changed_teams, fill = success_v1)) +
geom_bar(position = "fill") +
scale_fill_manual(
name = "Successful?",
values = c("#D3D3D3", "#87CEEB"),
labels = c("No", "Yes")
) +
labs(
x = "Changed Teams (1 = Yes)",
y = "Proportion",
title = "NFL Rookies Switching Teams and Success Rate",
subtitle = "Have a different primary roster team from their draft team") +
theme(plot.subtitle = element_text(size = 5)) +
theme_minimal()
combined_for_analysis %>%
group_by(age) %>%
summarize(success_rate = mean(success_v1 == 1),
n = n()) %>%
ggplot(aes(x = age, y = success_rate)) +
geom_point() +
geom_smooth(method = "lm") +
labs(x = "Age (at draft)",
y = "Success Rate",
title = "Player Age vs Average Success Rate") +
theme(plot.subtitle = element_text(size = 5)) +
theme_classic()
combined_for_analysis %>%
group_by(age) %>%
filter(age < 26) %>%
summarize(success_rate = mean(success_v1 == 1),
n = n()) %>%
ggplot(aes(x = age, y = success_rate)) +
geom_point() +
geom_smooth(method = "lm") +
labs(x = "Age (at draft)",
y = "Success Rate",
title = "Player Age vs Average Success Rate",
subtitle = "After Dropping Rookies Older than 25") +
theme(plot.subtitle = element_text(size = 5)) +
theme_classic()
rm(list = ls())
data <- read_csv("data processing/full_data.csv")
model_data <- data %>%
filter(season != 2025) %>%
select(-c("gsis_id", "weeks_active", "full_name", "position",
"college_year", "season", "team", "conference",
"position_college", "draft_pick", "draft_team_wp_prev",
"draft_team")) %>%
mutate(primary_roster_team = as.factor(primary_roster_team),
multiple_teams = as.factor(multiple_teams),
success_v1 = as.factor(success_v1),
num_schools = as.factor(num_schools),
side_of_ball = as.factor(side_of_ball),
position_group = as.factor(position_group),
div_level = as.factor(div_level) %>% relevel("D1 FBS"),
comp_level = as.factor(comp_level) %>% relevel("P5"),
pos_skill_flag = as.factor(pos_skill_flag),
different_position = as.factor(different_position),
drafted = as.factor(drafted),
changed_teams = as.factor(changed_teams),
draft_round = as.factor(draft_round) %>% relevel("Undrafted"),
draft_round_cat = as.factor(draft_round_cat) %>% relevel("Undrafted")
)
model_data <- model_data %>%
mutate(dr_av_missing = as.factor(ifelse(is.na(dr_av), 1, 0)),
age_missing = as.factor(ifelse(is.na(age), 1, 0)))
avg_dr_av_pos_round <- model_data %>%
group_by(position_group, draft_round) %>%
summarize(avg_dr_av = mean(dr_av, na.rm = TRUE), .groups = "drop") %>%
mutate(avg_dr_av = round(avg_dr_av))
avg_dr_av_pos <- model_data %>%
group_by(position_group) %>%
summarize(avg_dr_av = mean(dr_av, na.rm = TRUE), .groups = "drop") %>%
mutate(avg_dr_av = round(avg_dr_av))
avg_age <- model_data %>%
summarize(avg_age = mean(age, na.rm = TRUE), .groups = "drop") %>%
mutate(avg_age = round(avg_age)) %>%
pull()
model_data_imp <- model_data %>%
left_join(avg_dr_av_pos_round, by = c("position_group", "draft_round"),
suffix = c("", "_pos_round")) %>%
left_join(avg_dr_av_pos, by = "position_group",
suffix = c("", "_pos"))
model_data_imp <- model_data_imp %>%
mutate(dr_av_imputed = if_else(is.na(dr_av),
coalesce(.data$avg_dr_av, .data$avg_dr_av_pos),
dr_av)) %>%
select(-avg_dr_av, -avg_dr_av_pos)
model_data_imp <- model_data_imp %>%
mutate(dr_av = dr_av_imputed,
age = ifelse(is.na(age), avg_age, age)) %>%
select(-dr_av_imputed) %>%
filter(age != 26, age!= 27)
save(avg_dr_av_pos, avg_dr_av_pos_round, avg_age,
file = "data processing/avg_imputation.Rdata")
rm(avg_dr_av_pos, avg_dr_av_pos_round, avg_age)
rf_model_data <- model_data_imp %>%
mutate(across(where(is.factor),
~ fct_lump_min(.x, min = 20))) %>%
select(-c("age_missing", "changed_teams", "dr_av_missing", "drafted",
"draft_round_cat", "position_group"))
gg_miss_var(rf_model_data)
set.seed(22)
split = sample.split(rf_model_data$success_v1, SplitRatio = 0.8)
set.seed(22)
rf <- randomForest(success_v1 ~., data = rf_model_data[split,],
ntree = 501,
importance = TRUE)
print(rf)
##
## Call:
## randomForest(formula = success_v1 ~ ., data = rf_model_data[split, ], ntree = 501, importance = TRUE)
## Type of random forest: classification
## Number of trees: 501
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 27.71%
## Confusion matrix:
## 0 1 class.error
## 0 374 367 0.4952767
## 1 218 1152 0.1591241
plot(rf)
rf_class <- predict(rf, rf_model_data[!split,], type = "response")
fscore <- confusionMatrix(rf_class, rf_model_data$success_v1[!split],
positive = "1")$byClass["F1"]
rf$importance
## 0 1 MeanDecreaseAccuracy
## primary_roster_team 0.0095610687 -0.0020081069 2.064763e-03
## multiple_teams -0.0013777336 0.0007205110 -1.524129e-05
## draft_round 0.1567007927 0.0298231898 7.429152e-02
## age 0.0268718131 -0.0070111733 4.833753e-03
## dr_av 0.0740806418 -0.0071368106 2.135590e-02
## num_schools 0.0035957443 0.0006025060 1.665904e-03
## side_of_ball 0.0084920100 0.0018649977 4.187462e-03
## div_level -0.0017517056 0.0012444771 1.945249e-04
## comp_level 0.0006484189 0.0011648535 9.680349e-04
## pos_skill_flag 0.0020695524 -0.0016032128 -3.214563e-04
## different_position 0.0049387605 -0.0001451474 1.650934e-03
## roster_team_wp_prev 0.0128688407 -0.0071043682 -1.022610e-04
## MeanDecreaseGini
## primary_roster_team 221.632969
## multiple_teams 6.847433
## draft_round 185.247973
## age 38.769534
## dr_av 132.836073
## num_schools 16.940535
## side_of_ball 27.407014
## div_level 6.344361
## comp_level 28.415333
## pos_skill_flag 35.613748
## different_position 19.467025
## roster_team_wp_prev 94.884539
varImpPlot(rf)
OOB Error is 27.71% and F-score of 0.785.
mtry parameterm <- seq(2, 12)
fscore.seq <- numeric()
for(i in 1:length(m)){
set.seed(22)
rf <- randomForest(success_v1 ~., data = rf_model_data[split,],
ntree = 501,
importance = TRUE,
mtry = m[i])
rf_class <- predict(rf, newdata = rf_model_data[!split,], type = "response")
fscore.seq[i] <- confusionMatrix(rf_class, rf_model_data$success_v1[!split],
positive = "1")$byClass["F1"]
}
plot(m, fscore.seq, pch = 19 , col="blue", type = "b",
ylab = "F-score", xlab = "Number of Predictors considered at each split")
m_best <- m[which.max(fscore.seq)]
rm(fscore.seq, i, m)
ntree parameterntrees <- c(101, 251, 501, 751, 1001)
fscore.seq <- numeric()
for(i in seq_along(ntrees)){
set.seed(22)
rf <- randomForest(success_v1 ~., data = rf_model_data[split,],
ntree = ntrees[i],
importance = TRUE,
mtry = m_best)
rf_class <- predict(rf, newdata = rf_model_data[!split,], type = "response")
fscore.seq[i] <- confusionMatrix(rf_class, rf_model_data$success_v1[!split],
positive = "1")$byClass["F1"]
}
plot(ntrees, fscore.seq, pch = 19 , col="blue", type = "b",
ylab = "F-score", xlab = "Number of Trees used in Random Forest")
ntrees_best <- ntrees[which.max(fscore.seq)]
rm(ntrees, fscore.seq)
set.seed(22)
rf_final <- randomForest(success_v1 ~., data = rf_model_data[split,],
ntree = ntrees_best, mtry = m_best, importance = TRUE)
print(rf_final)
##
## Call:
## randomForest(formula = success_v1 ~ ., data = rf_model_data[split, ], ntree = ntrees_best, mtry = m_best, importance = TRUE)
## Type of random forest: classification
## Number of trees: 501
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 27.66%
## Confusion matrix:
## 0 1 class.error
## 0 365 376 0.5074224
## 1 208 1162 0.1518248
plot(rf_final)
rf_final_class <- predict(rf_final, newdata = rf_model_data[!split,],
type = "response")
confusionMatrix(rf_final_class, rf_model_data$success_v1[!split])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 97 57
## 1 88 286
##
## Accuracy : 0.7254
## 95% CI : (0.6852, 0.763)
## No Information Rate : 0.6496
## P-Value [Acc > NIR] : 0.0001223
##
## Kappa : 0.3725
##
## Mcnemar's Test P-Value : 0.0127254
##
## Sensitivity : 0.5243
## Specificity : 0.8338
## Pos Pred Value : 0.6299
## Neg Pred Value : 0.7647
## Prevalence : 0.3504
## Detection Rate : 0.1837
## Detection Prevalence : 0.2917
## Balanced Accuracy : 0.6791
##
## 'Positive' Class : 0
##
fscore <- confusionMatrix(rf_final_class, rf_model_data$success_v1[!split],
positive = "1")$byClass["F1"]
rf_final$importance
## 0 1 MeanDecreaseAccuracy
## primary_roster_team 0.0119026592 -3.236297e-03 2.076146e-03
## multiple_teams -0.0010087311 4.531214e-04 -5.454426e-05
## draft_round 0.1613960468 2.768474e-02 7.436701e-02
## age 0.0231650582 -7.049128e-03 3.543891e-03
## dr_av 0.0797720558 -9.357569e-03 2.178815e-02
## num_schools 0.0045641444 -5.612973e-05 1.563106e-03
## side_of_ball 0.0083911465 2.494246e-03 4.557601e-03
## div_level -0.0013153925 9.716571e-04 1.674719e-04
## comp_level 0.0030798867 6.172516e-04 1.492653e-03
## pos_skill_flag -0.0001094845 -8.088085e-04 -5.821274e-04
## different_position 0.0050036070 -6.170856e-04 1.362034e-03
## roster_team_wp_prev 0.0143345011 -7.327268e-03 2.717310e-04
## MeanDecreaseGini
## primary_roster_team 241.841521
## multiple_teams 7.151242
## draft_round 196.427442
## age 37.118685
## dr_av 143.217668
## num_schools 18.059708
## side_of_ball 31.197995
## div_level 7.035268
## comp_level 32.236374
## pos_skill_flag 41.250450
## different_position 22.809933
## roster_team_wp_prev 112.637920
varImpPlot(rf_final)
OOB Error is 27.66% and F-score of 0.798, showing slight improvement. Accuracy (72.54%) > NIR (64.96%)
var_imp <- rf_final$importance
top_features <- rownames(var_imp[order(var_imp[, "MeanDecreaseAccuracy"],
decreasing = TRUE), ])[1:8]
cramers_df <- combn(top_features, 2, simplify = FALSE) %>%
map_dfr(~ data.frame(
var1 = .x[1],
var2 = .x[2],
cramersV = cramersV(
rf_model_data[[.x[1]]],
rf_model_data[[.x[2]]]
)
))
cramers_df <- cramers_df %>%
filter(cramersV > 0.6)
appearance_counts <- cramers_df %>%
pivot_longer(cols = c(var1, var2),
names_to = "side",
values_to = "variable") %>%
count(variable, sort = TRUE)
cramers_df
## [1] var1 var2 cramersV
## <0 rows> (or 0-length row.names)
appearance_counts
## # A tibble: 0 × 2
## # ℹ 2 variables: variable <chr>, n <int>
save(rf_model_data, m_best, ntrees_best, rf_final, top_features, split,
file = "data processing/RF_results.Rdata")
rm(list = ls())
load("data processing/RF_results.Rdata")
imp_formula <- as.formula(paste("success_v1 ~",
paste(top_features, collapse = " + ")))
set.seed(22)
imp_model <- glm(imp_formula, family = binomial, data = rf_model_data[split,])
summary(imp_model)
##
## Call:
## glm(formula = imp_formula, family = binomial, data = rf_model_data[split,
## ])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.01754 1.97609 1.021 0.30727
## draft_round1 2.45722 0.27990 8.779 < 2e-16 ***
## draft_round2 2.86060 0.28367 10.084 < 2e-16 ***
## draft_round3 3.19032 0.26549 12.017 < 2e-16 ***
## draft_round4 2.52897 0.22529 11.225 < 2e-16 ***
## draft_round5 1.97794 0.20470 9.663 < 2e-16 ***
## draft_round6 2.03173 0.21447 9.473 < 2e-16 ***
## draft_round7 1.48468 0.22170 6.697 2.13e-11 ***
## dr_av 0.07057 0.01240 5.689 1.27e-08 ***
## side_of_ballOff -0.10664 0.11933 -0.894 0.37153
## side_of_ballST 1.63365 0.33504 4.876 1.08e-06 ***
## age -0.12776 0.08776 -1.456 0.14545
## primary_roster_teamATL -0.14039 0.50614 -0.277 0.78149
## primary_roster_teamBAL -1.21847 0.44903 -2.714 0.00666 **
## primary_roster_teamBUF -1.08238 0.49047 -2.207 0.02733 *
## primary_roster_teamCAR -0.35968 0.45737 -0.786 0.43163
## primary_roster_teamCHI -0.33592 0.46942 -0.716 0.47424
## primary_roster_teamCIN -0.08117 0.45455 -0.179 0.85827
## primary_roster_teamCLE -0.46133 0.44313 -1.041 0.29784
## primary_roster_teamDAL -0.05129 0.45719 -0.112 0.91068
## primary_roster_teamDEN -0.67843 0.44594 -1.521 0.12818
## primary_roster_teamDET -1.06850 0.45896 -2.328 0.01991 *
## primary_roster_teamGB -0.51799 0.43200 -1.199 0.23051
## primary_roster_teamHOU -0.09955 0.47386 -0.210 0.83361
## primary_roster_teamIND -0.77103 0.44339 -1.739 0.08204 .
## primary_roster_teamJAX -0.91320 0.43406 -2.104 0.03539 *
## primary_roster_teamKC 0.08289 0.50197 0.165 0.86883
## primary_roster_teamLA 0.21756 0.44347 0.491 0.62372
## primary_roster_teamLAC -0.52243 0.43782 -1.193 0.23277
## primary_roster_teamLV -0.21396 0.44254 -0.483 0.62875
## primary_roster_teamMIA -0.28732 0.45612 -0.630 0.52875
## primary_roster_teamMIN -0.48250 0.44796 -1.077 0.28144
## primary_roster_teamNE -0.52072 0.46853 -1.111 0.26640
## primary_roster_teamNO -0.57211 0.46241 -1.237 0.21600
## primary_roster_teamNYG -0.11842 0.44879 -0.264 0.79188
## primary_roster_teamNYJ -0.19157 0.45844 -0.418 0.67603
## primary_roster_teamPHI -0.30919 0.46232 -0.669 0.50363
## primary_roster_teamPIT -0.50761 0.51742 -0.981 0.32657
## primary_roster_teamSEA -0.37580 0.46627 -0.806 0.42026
## primary_roster_teamSF -0.19117 0.46209 -0.414 0.67908
## primary_roster_teamTB -0.13320 0.46512 -0.286 0.77459
## primary_roster_teamTEN -0.68994 0.45957 -1.501 0.13329
## primary_roster_teamWAS -0.18134 0.43870 -0.413 0.67935
## num_schools2 -0.19437 0.16186 -1.201 0.22981
## num_schoolsOther -1.54158 0.84311 -1.828 0.06748 .
## comp_levelFCS 0.24103 0.27321 0.882 0.37765
## comp_levelFCS Power -0.05548 0.29594 -0.187 0.85130
## comp_levelG5 -0.11964 0.13142 -0.910 0.36262
## different_position1 0.02404 0.12954 0.186 0.85277
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2736.2 on 2110 degrees of freedom
## Residual deviance: 2109.6 on 2062 degrees of freedom
## AIC: 2207.6
##
## Number of Fisher Scoring iterations: 6
vif(imp_model) ## ALL UNDER 3
## GVIF Df GVIF^(1/(2*Df))
## draft_round 2.047566 7 1.052522
## dr_av 1.516863 1 1.231610
## side_of_ball 1.288500 2 1.065421
## age 1.319426 1 1.148663
## primary_roster_team 1.358856 31 1.004958
## num_schools 1.081476 2 1.019775
## comp_level 1.162246 3 1.025376
## different_position 1.206517 1 1.098415
lr_probs <- predict(imp_model, rf_model_data[!split,], type = "response")
lr_class <- ifelse(lr_probs > 0.5, 1, 0)
lr_misc <- 1 - mean(lr_class == rf_model_data$success_v1[!split])
lr_cm <- table(rf_model_data[!split,]$success_v1, lr_class)
tp <- lr_cm["1", "1"]
fp <- lr_cm["0", "1"]
fn <- lr_cm["1", "0"]
precision <- tp / (tp + fp)
recall <- tp / (tp + fn)
f1 <- 2 * precision * recall / (precision + recall)
rm(tp, fp, fn)
All VIFs under 2.5, Misclassification rate of 29.55%, Accuracy of 70.45%, F1 of 0.7672.
full <- glm(imp_formula, family = binomial, data = rf_model_data[split,])
null <- glm(success_v1 ~ 1, family = binomial, data = rf_model_data[split,])
step_imp_model <- step(null, scope = formula(full),
direction = "both", k = 2)
## Start: AIC=2738.17
## success_v1 ~ 1
##
## Df Deviance AIC
## + draft_round 7 2227.8 2243.8
## + dr_av 1 2690.4 2694.4
## + comp_level 3 2702.1 2710.1
## + num_schools 2 2720.7 2726.7
## + side_of_ball 2 2728.9 2734.9
## <none> 2736.2 2738.2
## + age 1 2736.1 2740.1
## + different_position 1 2736.2 2740.2
## + primary_roster_team 31 2699.6 2763.6
##
## Step: AIC=2243.75
## success_v1 ~ draft_round
##
## Df Deviance AIC
## + dr_av 1 2189.8 2207.8
## + side_of_ball 2 2206.4 2226.4
## + age 1 2222.2 2240.2
## + num_schools 2 2221.7 2241.7
## <none> 2227.8 2243.8
## + different_position 1 2226.8 2244.8
## + comp_level 3 2225.7 2247.7
## + primary_roster_team 31 2191.2 2269.2
## - draft_round 7 2736.2 2738.2
##
## Step: AIC=2207.77
## success_v1 ~ draft_round + dr_av
##
## Df Deviance AIC
## + side_of_ball 2 2159.6 2181.6
## + num_schools 2 2184.4 2206.4
## + age 1 2187.2 2207.2
## <none> 2189.8 2207.8
## + different_position 1 2187.8 2207.8
## + comp_level 3 2187.6 2211.6
## + primary_roster_team 31 2151.4 2231.4
## - dr_av 1 2227.8 2243.8
## - draft_round 7 2690.4 2694.4
##
## Step: AIC=2181.59
## success_v1 ~ draft_round + dr_av + side_of_ball
##
## Df Deviance AIC
## + num_schools 2 2154.2 2180.2
## + age 1 2157.2 2181.2
## <none> 2159.6 2181.6
## + different_position 1 2159.5 2183.5
## + comp_level 3 2157.6 2185.6
## + primary_roster_team 31 2119.9 2203.9
## - side_of_ball 2 2189.8 2207.8
## - dr_av 1 2206.4 2226.4
## - draft_round 7 2678.8 2686.8
##
## Step: AIC=2180.25
## success_v1 ~ draft_round + dr_av + side_of_ball + num_schools
##
## Df Deviance AIC
## <none> 2154.2 2180.2
## + age 1 2152.5 2180.5
## - num_schools 2 2159.6 2181.6
## + different_position 1 2154.2 2182.2
## + comp_level 3 2152.2 2184.2
## + primary_roster_team 31 2113.8 2201.8
## - side_of_ball 2 2184.4 2206.4
## - dr_av 1 2200.5 2224.5
## - draft_round 7 2666.1 2678.1
summary(step_imp_model)
##
## Call:
## glm(formula = success_v1 ~ draft_round + dr_av + side_of_ball +
## num_schools, family = binomial, data = rf_model_data[split,
## ])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.19426 0.15248 -7.832 4.80e-15 ***
## draft_round1 2.43614 0.27270 8.934 < 2e-16 ***
## draft_round2 2.81228 0.27750 10.134 < 2e-16 ***
## draft_round3 3.07013 0.25931 11.840 < 2e-16 ***
## draft_round4 2.43049 0.21501 11.304 < 2e-16 ***
## draft_round5 1.89147 0.19369 9.766 < 2e-16 ***
## draft_round6 1.92003 0.20249 9.482 < 2e-16 ***
## draft_round7 1.39396 0.20695 6.736 1.63e-11 ***
## dr_av 0.07051 0.01216 5.798 6.70e-09 ***
## side_of_ballOff -0.11520 0.10891 -1.058 0.2901
## side_of_ballST 1.56227 0.32549 4.800 1.59e-06 ***
## num_schools2 -0.16948 0.15663 -1.082 0.2792
## num_schoolsOther -1.54079 0.82608 -1.865 0.0622 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2736.2 on 2110 degrees of freedom
## Residual deviance: 2154.2 on 2098 degrees of freedom
## AIC: 2180.2
##
## Number of Fisher Scoring iterations: 6
fwd_imp_model <- step(null, scope = formula(full),
direction = "forward", k = 2, trace = FALSE)
summary(fwd_imp_model)
##
## Call:
## glm(formula = success_v1 ~ draft_round + dr_av + side_of_ball +
## num_schools, family = binomial, data = rf_model_data[split,
## ])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.19426 0.15248 -7.832 4.80e-15 ***
## draft_round1 2.43614 0.27270 8.934 < 2e-16 ***
## draft_round2 2.81228 0.27750 10.134 < 2e-16 ***
## draft_round3 3.07013 0.25931 11.840 < 2e-16 ***
## draft_round4 2.43049 0.21501 11.304 < 2e-16 ***
## draft_round5 1.89147 0.19369 9.766 < 2e-16 ***
## draft_round6 1.92003 0.20249 9.482 < 2e-16 ***
## draft_round7 1.39396 0.20695 6.736 1.63e-11 ***
## dr_av 0.07051 0.01216 5.798 6.70e-09 ***
## side_of_ballOff -0.11520 0.10891 -1.058 0.2901
## side_of_ballST 1.56227 0.32549 4.800 1.59e-06 ***
## num_schools2 -0.16948 0.15663 -1.082 0.2792
## num_schoolsOther -1.54079 0.82608 -1.865 0.0622 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2736.2 on 2110 degrees of freedom
## Residual deviance: 2154.2 on 2098 degrees of freedom
## AIC: 2180.2
##
## Number of Fisher Scoring iterations: 6
bwd_imp_model <- step(full, direction = "backward", k = 2, trace = FALSE)
summary(bwd_imp_model)
##
## Call:
## glm(formula = success_v1 ~ draft_round + dr_av + side_of_ball +
## num_schools, family = binomial, data = rf_model_data[split,
## ])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.19426 0.15248 -7.832 4.80e-15 ***
## draft_round1 2.43614 0.27270 8.934 < 2e-16 ***
## draft_round2 2.81228 0.27750 10.134 < 2e-16 ***
## draft_round3 3.07013 0.25931 11.840 < 2e-16 ***
## draft_round4 2.43049 0.21501 11.304 < 2e-16 ***
## draft_round5 1.89147 0.19369 9.766 < 2e-16 ***
## draft_round6 1.92003 0.20249 9.482 < 2e-16 ***
## draft_round7 1.39396 0.20695 6.736 1.63e-11 ***
## dr_av 0.07051 0.01216 5.798 6.70e-09 ***
## side_of_ballOff -0.11520 0.10891 -1.058 0.2901
## side_of_ballST 1.56227 0.32549 4.800 1.59e-06 ***
## num_schools2 -0.16948 0.15663 -1.082 0.2792
## num_schoolsOther -1.54079 0.82608 -1.865 0.0622 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2736.2 on 2110 degrees of freedom
## Residual deviance: 2154.2 on 2098 degrees of freedom
## AIC: 2180.2
##
## Number of Fisher Scoring iterations: 6
get_varimp_df <- function(model, model_name) {
coef_df <- tidy(model, conf.int = TRUE) %>%
filter(term != "(Intercept)") %>%
mutate(odds_ratio = exp(estimate),
conf.low = exp(conf.low),
conf.high = exp(conf.high),
model = model_name)
}
varimp_df <- bind_rows(
get_varimp_df(imp_model, "Full"),
get_varimp_df(step_imp_model, "Stepwise"),
get_varimp_df(fwd_imp_model, "Forward"),
get_varimp_df(bwd_imp_model, "Backward")
)
ggplot(varimp_df,
aes(x = reorder(term, odds_ratio), y = odds_ratio)) +
geom_col(fill = "steelblue") +
coord_flip() +
facet_wrap(~ model, scales = "free_y") +
labs(
title = "Variable Importance Across Logistic Regression Models",
x = "Feature",
y = "Odds Ratio"
) +
theme_minimal() +
theme(
strip.text = element_text(face = "bold"),
axis.text.y = element_text(size = 9)
)
final_reg_model <- step_imp_model
summary(final_reg_model)
##
## Call:
## glm(formula = success_v1 ~ draft_round + dr_av + side_of_ball +
## num_schools, family = binomial, data = rf_model_data[split,
## ])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.19426 0.15248 -7.832 4.80e-15 ***
## draft_round1 2.43614 0.27270 8.934 < 2e-16 ***
## draft_round2 2.81228 0.27750 10.134 < 2e-16 ***
## draft_round3 3.07013 0.25931 11.840 < 2e-16 ***
## draft_round4 2.43049 0.21501 11.304 < 2e-16 ***
## draft_round5 1.89147 0.19369 9.766 < 2e-16 ***
## draft_round6 1.92003 0.20249 9.482 < 2e-16 ***
## draft_round7 1.39396 0.20695 6.736 1.63e-11 ***
## dr_av 0.07051 0.01216 5.798 6.70e-09 ***
## side_of_ballOff -0.11520 0.10891 -1.058 0.2901
## side_of_ballST 1.56227 0.32549 4.800 1.59e-06 ***
## num_schools2 -0.16948 0.15663 -1.082 0.2792
## num_schoolsOther -1.54079 0.82608 -1.865 0.0622 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2736.2 on 2110 degrees of freedom
## Residual deviance: 2154.2 on 2098 degrees of freedom
## AIC: 2180.2
##
## Number of Fisher Scoring iterations: 6
vif(final_reg_model) ## ALL UNDER 2, only 4 vars
## GVIF Df GVIF^(1/(2*Df))
## draft_round 1.514226 7 1.030080
## dr_av 1.492082 1 1.221508
## side_of_ball 1.054453 2 1.013344
## num_schools 1.011185 2 1.002785
rm(list = setdiff(ls(), c("final_reg_model", "rf_model_data", "split")))
final_probs <- predict(final_reg_model, rf_model_data[!split,], type = "response")
pred_class <- ifelse(final_probs > 0.5, 1, 0)
misc_rate <- 1 - mean(pred_class == rf_model_data$success_v1[!split])
cm <- table(rf_model_data[!split,]$success_v1, pred_class)
tp <- cm["1", "1"]
fp <- cm["0", "1"]
fn <- cm["1", "0"]
precision <- tp / (tp + fp)
recall <- tp / (tp + fn)
f1 <- 2 * precision * recall / (precision + recall)
rm(tp, fp, fn)
Misclassification rate of 27.84%, Accuracy of 72.16%, F1 of 0.7
save(final_reg_model, file = "data processing/future predictions.Rdata")
plot(final_reg_model)
cooks_d <- cooks.distance(final_reg_model)
cutoff <- 4 / nrow(rf_model_data[split, ])
num_influential <- sum(cooks_d > cutoff, na.rm = TRUE)
print(paste0("Number of Influential Observations (out of 2111): ",
num_influential))
## [1] "Number of Influential Observations (out of 2111): 116"
influencePlot(final_reg_model)
## StudRes Hat CookD
## 16 -3.1478306 0.002529918 0.023604327
## 36 -3.2479772 0.002069272 0.026183771
## 957 -0.9018721 0.159614000 0.007655024
## 1857 1.3666086 0.175580064 0.024471876
## 2021 2.2393826 0.066601427 0.048042772
Not great, but logistic regression is more robust so linearity isn’t as important. I also don’t have any truly continuous variables, so linearity vs log-odds doesn’t need to be checked. Still definitely a limitation of this project.
prob_df <- data.frame(
success_v1 = rf_model_data$success_v1[!split],
lr_probs = final_probs
)
ggplot(prob_df, aes(x = final_probs)) +
geom_density(fill = "#2E86AB") +
labs(
title = "Distribution of Predicted Rookie Success Probabilities",
x = "Predicted Probability",
y = "Count"
) +
theme_minimal(base_size = 13)
ggplot(prob_df, aes(x = final_probs, fill = success_v1)) +
geom_density(alpha = 0.6) +
scale_fill_manual(values = c("0" = "#B0B0B0", "1" = "#2E86AB")) +
labs(
title = "Predicted Probabilities by Actual Outcome",
subtitle = "Success (1) vs Not Success (0)",
x = "Predicted Probability",
fill = ""
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold"),
legend.position = "top"
)
std_coefs <- standardize_parameters(final_reg_model)
ggplot(std_coefs,
aes(x = reorder(Parameter, abs(Std_Coefficient)),
y = Std_Coefficient)) +
geom_col() +
coord_flip() +
labs(
title = "Standardized Coefficients (Relative Importance)",
x = "Feature",
y = "Standardized Effect"
) +
theme_minimal()
coef_df <- tidy(final_reg_model, conf.int = TRUE) %>%
filter(term != "(Intercept)") %>%
mutate(
odds_ratio = exp(estimate),
conf.low = exp(conf.low),
conf.high = exp(conf.high)
)
ggplot(coef_df,
aes(x = reorder(term, odds_ratio), y = odds_ratio)) +
geom_point(size = 2) +
geom_errorbar(
aes(ymin = conf.low, ymax = conf.high),
width = 0.2
) +
geom_hline(yintercept = 1, linetype = "dashed") +
coord_flip() +
labs(
title = "Logistic Regression Variable Importance",
x = "Feature",
y = "Odds Ratio"
) +
theme_minimal()
prob_df_full <- data.frame(
draft_round = rf_model_data$draft_round[!split],
dr_av = rf_model_data$dr_av[!split],
side_of_ball = rf_model_data$side_of_ball[!split],
num_schools = rf_model_data$num_schools[!split],
final_probs = final_probs
)
ggplot(prob_df_full, aes(x = draft_round, y = side_of_ball,
fill = final_probs)) +
geom_tile(color = "black") +
labs(x = "Draft Round Category", y = "Side of Ball",
fill = "Success Probability") +
theme_classic()
ggplot(prob_df_full, aes(x = draft_round, y = num_schools, fill = final_probs)) +
geom_tile(color = "black") +
labs(x = "Draft Round Category", y = "Number of Colleges",
fill = "Success Probability") +
theme_classic()
ggplot(prob_df_full, aes(x = num_schools, y = side_of_ball, fill = final_probs)) +
geom_tile(color = "black") +
labs(x = "Number of Colleges", y = "Side of Ball",
fill = "Success Probability") +
theme_classic()
### Clean Environment
rm(list = ls())
data <- read_csv("data processing/full_data.csv")
data_for_val <- data %>%
filter(season == 2025) %>%
mutate(primary_roster_team = as.factor(primary_roster_team),
multiple_teams = as.factor(multiple_teams),
success_v1 = as.factor(success_v1),
num_schools = as.factor(num_schools),
side_of_ball = as.factor(side_of_ball),
position_group = as.factor(position_group),
div_level = as.factor(div_level) %>% relevel("D1 FBS"),
comp_level = as.factor(comp_level) %>% relevel("P5"),
pos_skill_flag = as.factor(pos_skill_flag),
different_position = as.factor(different_position),
drafted = as.factor(drafted),
changed_teams = as.factor(changed_teams),
draft_round = as.factor(draft_round) %>% relevel("Undrafted"),
draft_round_cat = as.factor(draft_round_cat) %>% relevel("Undrafted")
)
load("data processing/future predictions.Rdata")
load("data processing/avg_imputation.Rdata")
data_for_val <- data_for_val %>%
mutate(dr_av_missing = as.factor(ifelse(is.na(dr_av), 1, 0)),
age_missing = as.factor(ifelse(is.na(age), 1, 0)))
data_for_val <- data_for_val %>%
left_join(avg_dr_av_pos_round, by = c("position_group", "draft_round"),
suffix = c("", "_pos_round")) %>%
left_join(avg_dr_av_pos, by = "position_group",
suffix = c("", "_pos"))
data_for_val <- data_for_val %>%
mutate(dr_av_imputed = if_else(is.na(dr_av),
coalesce(.data$avg_dr_av, .data$avg_dr_av_pos),
dr_av)) %>%
select(-avg_dr_av, -avg_dr_av_pos)
data_for_val <- data_for_val %>%
mutate(dr_av = dr_av_imputed,
age = ifelse(is.na(age), avg_age, age)) %>%
select(-dr_av_imputed)
gg_miss_var(data_for_val) ## 2 vars, but not being used in my analysis
2 missing variables, but they’re not predictors in the model, so I’m not concerned.
data_for_val$num_schools <- factor(
data_for_val$num_schools,
levels = levels(model.frame(final_reg_model)$num_schools)
)
pred_probs_25 <- predict(final_reg_model, data_for_val,
type = "response")
pred_success_25 <- ifelse(pred_probs_25 > 0.5, 1, 0)
results <- data_for_val %>%
mutate(pred_prob = pred_probs_25)
ggplot(results, aes(x = pred_prob)) +
geom_density(fill = "#4C72B0") +
labs(
title = "Distribution of Predicted Rookie Success Probabilities",
x = "Predicted Probability",
y = "Count"
) +
theme_minimal(base_size = 13)
There are 14 players who predicted NA (they have NA
num_schools), which I believe became NA when releveled to
match model factors -> they’ve attended more than 2 colleges, which
is weird but a growing trend in cfb, so definitely worth investigating
in the future.
checks <- results %>%
filter(!is.na(pred_prob)) %>%
mutate(pred_success = ifelse(pred_prob > 0.5, 1, 0),
hit_success_wk14 = ifelse(pred_success == 1 & weeks_active > 8, 1, 0),
miss_success_wk14 = ifelse(pred_success == 1 & weeks_active < 6, 1, 0),
hit_fail_wk14 = ifelse(pred_success == 0 & weeks_active < 6, 1, 0),
miss_fail_wk14 = ifelse(pred_success == 0 & weeks_active > 8, 1, 0),
tbd_success_wk14 = ifelse(pred_success == 1 & weeks_active < 9 &
weeks_active > 5, 1, 0),
tbd_fail_wk14 = ifelse(pred_success == 0 & weeks_active > 5 &
weeks_active < 9, 1, 0)) %>%
select(primary_roster_team, weeks_active, full_name, position, team,
draft_round, dr_av, side_of_ball, num_schools, pred_prob, pred_success,
hit_success_wk14, miss_success_wk14, hit_fail_wk14, miss_fail_wk14,
tbd_success_wk14, tbd_fail_wk14) %>%
mutate(hit_success_wk14 = as.factor(hit_success_wk14),
miss_success_wk14 = as.factor(miss_success_wk14),
hit_fail_wk14 = as.factor(hit_fail_wk14),
miss_fail_wk14 = as.factor(miss_fail_wk14),
tbd_success_wk14 = as.factor(tbd_success_wk14),
tbd_fail_wk14 = as.factor(tbd_fail_wk14))
ggplot(checks, aes(x = pred_success, y = tbd_success_wk14, fill = pred_prob)) +
geom_tile(color = "black") +
labs(x = "Predicted Success", y = "Still Possible to Succeed",
fill = "Success Probability") +
theme_classic()
count(checks, tbd_success_wk14)
## # A tibble: 2 × 2
## tbd_success_wk14 n
## <fct> <int>
## 1 0 234
## 2 1 26
Not entirely certain of the code logic, but definitely something of interest.
max_prob <- results %>%
arrange(desc(pred_prob)) %>%
slice(1)
min_prob <- results %>%
arrange(pred_prob) %>%
slice(1)
cool_cases <- results %>%
filter(full_name == max_prob$full_name |
full_name == min_prob$full_name |
full_name == "Shedeur Sanders" |
full_name == "Jalen Milroe" |
full_name == "Darius Cooper") %>%
select(full_name, pred_prob, weeks_active, draft_round, side_of_ball, dr_av,
num_schools, dr_av_missing) %>%
arrange(desc(pred_prob))
cool_cases_tbl <- cool_cases %>%
mutate(
pred_prob = percent(pred_prob, accuracy = 0.1),
draft_round = as.character(draft_round),
dr_av_missing = ifelse(dr_av_missing == "1", "Yes", "No")
)
cool_cases_tbl %>%
gt() %>%
gt_theme_pff() %>%
cols_label(
full_name = "Player",
pred_prob = "Predicted Probability",
weeks_active = "Weeks Active",
draft_round = "Draft Round",
side_of_ball = "Side of Ball",
dr_av = "Draft AV",
dr_av_missing = "Draft AV Imputed?",
num_schools = "# Colleges"
) %>%
fmt_markdown(columns = full_name) %>%
tab_header(
title = "'Cool Case' Player Predictions",
) %>%
tab_source_note(
source_note = "Predictions from Logistic Regression Model"
)
| 'Cool Case' Player Predictions | |||||||
| Player | Predicted Probability | Weeks Active | Draft Round | Side of Ball | Draft AV | # Colleges | Draft AV Imputed? |
|---|---|---|---|---|---|---|---|
| Tyler Loop | 90.8% | 14 | 6 | ST | 0 | 1 | No |
| Jalen Milroe | 85.3% | 4 | 3 | Off | 0 | 1 | No |
| Shedeur Sanders | 60.2% | 8 | 5 | Off | 0 | 2 | No |
| Darius Cooper | 35.3% | 10 | Undrafted | Off | 10 | 1 | Yes |
| Ben Yurosek | 27.2% | 9 | Undrafted | Off | 7 | 2 | Yes |
| Predictions from Logistic Regression Model | |||||||